# Remove the following two lines if you want to use other tools for plotting.
library(ggplot2)  # you might have to install ggplot2 using `install.packages("ggplot2")`. You can also use something else to plot.
library(ggfortify) # you might have to install ggfortify using `install.packages("ggfortify")`. [autoplot](https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html) might be useful for looking at the PCA results.

Archetype analysis of Darwin’s finches

Tasks

Complete the following steps to perform an archetype analysis of the finch data.

Reference solution

Load the data.

# Read the finch measurements; the first row of the CSV holds the column names.
finches <- read.csv(
  "~/Dev/r-studio-binder/data/finches.csv",
  header = TRUE
)

Plot the data

PCA

# PCA on the correlation matrix (cor = TRUE => scale & center); scores = TRUE
# keeps the per-observation component scores in pca$scores.
pca <- princomp(finches, cor = TRUE, scores = TRUE)

Plot the variance captured by the 5 principal components.

# Print the loadings matrix (rows: original variables, columns: components).
# NOTE(review): the "Proportion Var" rows of this printout are the same for
# every component, so they do not show how much variance each component
# captures; summary(pca) or screeplot(pca) would -- confirm which was intended.
pca$loadings
## 
## Loadings:
##                       Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## wing.length            0.459  0.110  0.364  0.772  0.221
## tarsus.length          0.449  0.234  0.639 -0.547 -0.190
## upper.mandible.length  0.428  0.668 -0.604              
## upper.mandible.depth   0.452 -0.445 -0.210 -0.300  0.681
## lower.mandible.width   0.447 -0.538 -0.224  0.108 -0.670
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0
# Plot the PCA result via ggfortify's autoplot(), drawing the loadings and
# labelling each of them.
autoplot(
  pca,
  loadings = TRUE,
  loadings.label = TRUE
)

Compute the convex hull and highlight it in the plot

# Build the score data frame first: both chull() and the plot below read the
# first two principal-component scores from pca_df, so it has to exist before
# either of them runs.
pca_df <- data.frame(pca$scores)
# chull() returns the row indices of the points lying on the convex hull, so
# `hull` can be used directly to subset pca_df.
hull <- chull(pca_df$Comp.1, pca_df$Comp.2)
hull
##  [1]  10  80  75  94 114 113 117 127  62 130 125  60
# Layers, bottom to top: hull polygon, highlighted hull vertices, all points.
ggplot(data = pca_df, aes(x = Comp.1, y = Comp.2)) +
  geom_polygon(data = pca_df[hull, ], fill = "white") +
  geom_point(data = pca_df[hull, ], colour = "red", size = 5) +
  geom_point()

# Find the three hull points that span the triangle with the largest area.
#
# Every unordered triple of distinct hull points is visited exactly once
# (positions p < q < r within `hull`). The area of triangle (a, b, c) is the
# shoelace formula
#   |ax * (by - cy) + bx * (cy - ay) + cx * (ay - by)| / 2.
#
# Results:
#   winner - row indices (into pca_df) of the best triangle's vertices
#   area   - area of that triangle
# Both stay 0 when the hull has fewer than three points.
#
# NOTE(review): the previous version left the `k` loop without braces, so the
# third vertex was always the last hull point, and it used `ay + by` in the
# last term of the formula. The printed areas and winner recorded in the
# rendered document come from that version; re-knit to refresh them.
winner <- 0
area <- 0
n_hull <- length(hull)
for (p in seq_len(n_hull)) {
  i <- hull[p]
  ax <- pca_df$Comp.1[i]
  ay <- pca_df$Comp.2[i]
  # p + seq_len(n_hull - p) is empty when p is the last position, unlike
  # (p + 1):n_hull, which would count backwards.
  for (q in p + seq_len(n_hull - p)) {
    j <- hull[q]
    bx <- pca_df$Comp.1[j]
    by <- pca_df$Comp.2[j]
    for (r in q + seq_len(n_hull - q)) {
      k <- hull[r]
      cx <- pca_df$Comp.1[k]
      cy <- pca_df$Comp.2[k]
      area_new <- abs((ax * (by - cy) + bx * (cy - ay) + cx * (ay - by)) / 2)
      # Strict comparison: only a genuinely larger triangle replaces the
      # current best, so ties and zero-area triples never overwrite it.
      if (area_new > area) {
        print(area_new)
        area <- area_new
        winner <- c(i, j, k)
      }
    }
  }
}
## [1] 0.2920978
## [1] 1.078508
## [1] 1.763069
## [1] 5.193567
## [1] 7.029989
# Report the result of the search: `winner` holds the pca_df row indices of
# the three hull points that were selected.
print("The three points in the convex hull that correspond to the triangle with largest area")
## [1] "The three points in the convex hull that correspond to the triangle with largest area"
print(winner)
## [1]  10 114  60
# Hull plot with the selected triangle outlined in cyan.
pca_df <- data.frame(pca$scores)
hull_points <- pca_df[hull, ]
# Repeat the first vertex at the end so the path closes the triangle.
triangle_points <- pca_df[c(winner, winner[1]), ]
ggplot(data = pca_df, aes(x = Comp.1, y = Comp.2)) +
  geom_polygon(data = hull_points, fill = "white") +
  geom_point(data = hull_points, colour = "red", size = 5) +
  geom_path(data = triangle_points, colour = "cyan") +
  geom_point()

Longevity

## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the longevity data set; the first row of the CSV holds the column names.
df <- read.csv("data/szekely-2015-data.csv", header = TRUE)
# Keep only the columns that are numeric and contain no missing values.
# The && short-circuits, so the NA check only runs on numeric columns.
df_numeric <- dplyr::select_if(
  df,
  function(column) is.numeric(column) && !any(is.na(column))
)

Including Plots

You can also embed plots, for example: